Big Data Analytical Method Final Project

Dev. Environment & Software

FAQ

#R889

#R889


Library Import

library(xml2)
library(jsonlite)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(purrr)
library(data.table)
library(plotly)
library(lubridate) # time formatting
library(zoo) # time formatting

Data Import

typhoon_all <- read.csv("https://raw.githubusercontent.com/jason19970210/BigDataAnalyticalMethods/master/Final/Data/typhoon/typhoon_web_table.csv",stringsAsFactors = F)
eq_xml_url_base <- paste("https://raw.githubusercontent.com/jason19970210/BigDataAnalyticalMethods/master/Final/Data/earthquake/CWB-EQ-Catalog-%d","xml",sep = ".")
sea_level <- fromJSON("https://opendata.cwb.gov.tw/fileapi/v1/opendataapi/C-B0048-001?Authorization=CWB-64CBB768-EE64-4FD2-AED5-0A68D1A48B79&downloadType=WEB&format=JSON")
seatemp <- fromJSON("https://opendata.cwb.gov.tw/fileapi/v1/opendataapi/C-B0050-001?Authorization=CWB-64CBB768-EE64-4FD2-AED5-0A68D1A48B79&downloadType=WEB&format=JSON")

Data 1st Processing (Make Data Frame)

Earthquake data
map_df(2012:2017, function(i){
  xml_url <- read_xml(sprintf(eq_xml_url_base,i))
  eqinfo <- xml_find_all(xml_url, ".//earthquakeinfo")
  
  #tibble::tibble
  data.frame(originTime = xml_text(xml_find_first(eqinfo, ".//originTime")),
             epicenterLon = xml_text(xml_find_first(eqinfo, ".//epicenterLon")),
             epicenterLat = xml_text(xml_find_first(eqinfo, ".//epicenterLat")),
             depth = xml_text(xml_find_first(eqinfo, ".//depth")),
             magnitudeValue = xml_text(xml_find_first(eqinfo, ".//magnitudeValue")),
             gap = xml_text(xml_find_first(eqinfo, "./gap")),
             rms = xml_text(xml_find_first(eqinfo, "./rms")),
             erh = xml_text(xml_find_first(eqinfo, "./erh")),
             erz = xml_text(xml_find_first(eqinfo, "./erz")),
             quality = xml_text(xml_find_first(eqinfo, "./quality")),
             
             stringsAsFactors = FALSE   #doesnt have this parameter in tibble
  ) #end of data.frame
}) -> eq_df
Sea Level Data
sea_level <- sea_level$Cwbopendata$dataset$location

sea_level <- data.table(sea_level)
sea_level[, number := 1:nrow(sea_level)]
sea_level[, yyyymm := ItemValue[[.N]][[1]], by = number]
sea_level[, MeanSeaLevel := ItemValue[[.N]][2], by = number]
sea_level[, MeanSeaLevel_unit := ItemUnit[[.N]][1], by = number]
sea_level[, HHWL := ItemValue[[.N]][3], by = number]
sea_level[, HHWL_unit := ItemUnit[[.N]][2], by = number]
sea_level[, LLWL := ItemValue[[.N]][4], by = number]
sea_level[, LLWL_unit := ItemUnit[[.N]][3], by = number]

sea_level <- select(sea_level, SiteName, SiteId, yyyymm, MeanSeaLevel, MeanSeaLevel_unit, HHWL, HHWL_unit, LLWL, LLWL_unit)
Sea Temperture Data
seatemp <- seatemp$Cwbopendata$dataset$location

sea_test2 <- data.frame(obsrtime = NA,
                        MonthAvg = NA,
                        MonthHigh = NA,
                        MonthHighTime = NA,
                       MonthLow = NA,
                       MonthLowTime = NA,
                       LocationName = NA,
                       StationID = NA,
                       lon = NA,
                       lat = NA)


for (j in 1:nrow(seatemp)){
  
  sea_test <- data.frame(obsrtime = NA,
                         MonthAvg = NA,
                         MonthHigh = NA,
                         MonthHighTime = NA,
                         MonthLow = NA,
                         MonthLowTime = NA)
  
  for (i in 1:length(seatemp$time[[j]]$obsrtime)) {
    sea_temp  <- c(seatemp$time[[j]]$obsrtime[i], seatemp$time[[j]]$weatherElement$elementValue[[i]]$vlaue)
    sea_test <- rbind(sea_test, sea_temp)
  }
  sea_test <- sea_test[-1,]
  sea_test$LocationName <- seatemp$LocationName[j]
  sea_test$StationID <- seatemp$StationID[j]
  sea_test$lon <- seatemp$lon[j]
  sea_test$lat <- seatemp$lat[j]
  
  sea_test2 <- rbind(sea_test2, sea_test)
}

sea_test2 <- sea_test2[-1,]
seatemp <- select(sea_test2, 
                    LocationName, 
                    StationID, 
                    lon,
                    lat, 
                    obsrtime, 
                    MonthAvg, 
                    MonthHigh, 
                    MonthHighTime, 
                    MonthLow, 
                    MonthLowTime)

Data 2nd Processing

Re-Typing & Filter
eq_df
eq_df$originTime <- as.Date(eq_df$originTime)
eq_df$epicenterLon <- as.numeric(eq_df$epicenterLon)
eq_df$epicenterLat <- as.numeric(eq_df$epicenterLat)
eq_df$depth <- as.numeric(eq_df$depth)
eq_df$magnitudeValue <- as.numeric(eq_df$magnitudeValue)
eq_df$gap <- as.numeric(eq_df$gap)
eq_df$rms <- as.numeric(eq_df$rms)
eq_df$erh <- as.numeric(eq_df$erh)
eq_df$erz <- as.numeric(eq_df$erz)

eq_df$year <- as.numeric(substring(eq_df$originTime, 1,4))
eq_df$month <- as.numeric(substring(eq_df$originTime, 6,7))
eq_df$ym <- paste0(eq_df$year,"/",eq_df$month)
# Without NA error message
sea_level
sea_level$year <- as.numeric(substring(sea_level$yyyymm, 1,4))
sea_level$month <- as.numeric(substring(sea_level$yyyymm, 5,6))
sea_level$MeanSeaLevel <- as.numeric(sea_level$MeanSeaLevel)
sea_level$HHWL <- as.numeric(sea_level$HHWL)
sea_level$LLWL <- as.numeric(sea_level$LLWL)
# Without NA error message

sea_level <- sea_level %>% filter(.$month != 0) %>% filter(.$year <= 2017 & .$year >= 2012)
seatemp
seatemp$lon <- as.numeric(seatemp$lon)
seatemp$lat <- as.numeric(seatemp$lat)
seatemp$obsrtime_year <- as.numeric(substring(seatemp$obsrtime, 1,4))
seatemp$MonthAvg <- as.numeric(seatemp$MonthAvg)
seatemp$MonthHigh <- as.numeric(seatemp$MonthHigh)
seatemp$MonthLow <- as.numeric(seatemp$MonthLow)
# Without NA error message

New df

sub_level <- sea_level %>% filter(.$SiteName %in% c("高雄市 高雄","連江縣 馬祖","基隆市 基隆","臺中市 臺中港","臺東縣 蘭嶼")) %>% as.data.table()
sub_temp <- seatemp %>% filter(.$LocationName %in% c("高雄","馬祖","基隆","臺中港","蘭嶼"))
data.table(sub_level)
##         SiteName SiteId yyyymm MeanSeaLevel MeanSeaLevel_unit  HHWL
##   1: 臺東縣 蘭嶼   1396 201201        -67.9                cm    NA
##   2: 臺東縣 蘭嶼   1396 201202        -52.6                cm    NA
##   3: 臺東縣 蘭嶼   1396 201203        -45.0                cm    NA
##   4: 臺東縣 蘭嶼   1396 201204        -47.1                cm    NA
##   5: 臺東縣 蘭嶼   1396 201205        -37.8                cm    NA
##  ---                                                               
## 344: 連江縣 馬祖   1926 201707       -111.9                cm 199.6
## 345: 連江縣 馬祖   1926 201708       -105.8                cm 208.9
## 346: 連江縣 馬祖   1926 201709        -90.1                cm 200.1
## 347: 連江縣 馬祖   1926 201711        -92.2                cm 210.7
## 348: 連江縣 馬祖   1926 201712       -104.9                cm 214.7
##      HHWL_unit   LLWL LLWL_unit year month
##   1:        cm     NA        cm 2012     1
##   2:        cm     NA        cm 2012     2
##   3:        cm     NA        cm 2012     3
##   4:        cm     NA        cm 2012     4
##   5:        cm     NA        cm 2012     5
##  ---                                      
## 344:        cm -454.1        cm 2017     7
## 345:        cm -435.5        cm 2017     8
## 346:        cm -411.4        cm 2017     9
## 347:        cm -431.9        cm 2017    11
## 348:        cm -480.3        cm 2017    12
sub_level[,LocationName := ifelse(SiteName == "高雄市 高雄", "高雄", 
                                  ifelse(SiteName == "連江縣 馬祖", "馬祖",
                                        ifelse(SiteName == "基隆市 基隆", "基隆",
                                               ifelse(SiteName == "臺中市 臺中港", "臺中港",
                                                      ifelse(SiteName == "臺東縣 蘭嶼", "蘭嶼" ,"")
                                                      )
                                               )
                                        )
                                 )
          ]

sub_level$yyyymm <- paste0(sub_level$year,"/",sub_level$month)
sub_level_temp <- left_join(sub_level,sub_temp, by = c("LocationName","yyyymm"="obsrtime"))

sub_level_temp$ym <- gsub("/","-",sub_level_temp$yyyymm)
sub_level_temp$ym <- as.Date(as.yearmon(sub_level_temp$ym))
sub_level_temp['nag_lat'] <- -sub_level_temp['lat']
eq_level_temp <- left_join(eq_df, sub_level_temp, by = c("ym"="yyyymm"))

Plots & Statics

相關係數說明:

  • 1.0 = 完全正相關
  • 0.7 ~ 0.99 = 高度正相關
  • 0.4 ~ 0.69 = 中度正相關
  • 0.1 ~ 0.39 = 低度正相關
  • 0.1 以下 = 無相關
# `complete.obs` avoid the NA value
# `p-value` : 假設檢定、虛無假設 (Null Hypothesis, H0)
cor.test(sub_level_temp$MeanSeaLevel, sub_level_temp$MonthAvg, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  sub_level_temp$MeanSeaLevel and sub_level_temp$MonthAvg
## t = 8.0322, df = 339, p-value = 1.598e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3066752 0.4854478
## sample estimates:
##       cor 
## 0.3998575
t.test(sub_level_temp$MeanSeaLevel , sub_level_temp$MonthAvg)
## 
##  Welch Two Sample t-test
## 
## data:  sub_level_temp$MeanSeaLevel and sub_level_temp$MonthAvg
## t = -16.354, df = 355.52, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -46.39381 -36.43349
## sample estimates:
## mean of x mean of y 
## -17.74943  23.66422
plot_ly(x = ~sub_level_temp$MonthAvg, y = ~sub_level_temp$MeanSeaLevel, type = 'scatter', mode = 'markers', color = ~sub_level_temp$LocationName)
time_temp <- sub_level_temp %>% plot_ly(x = ~.$ym, y = ~.$MonthAvg, type = 'scatter', mode = 'lines+markers', color = ~.$LocationName)
time_level <- sub_level_temp %>% plot_ly(x = ~.$ym, y = ~.$MeanSeaLevel, type = 'scatter', mode = 'lines+markers', color = ~.$LocationName)
subplot(time_temp,time_level,nrows = 2)
plot_ly(x = ~sub_level_temp$nag_lat, y = ~sub_level_temp$lon, z = ~sub_level_temp$MeanSeaLevel, type = 'scatter3d', mode = 'markers', color = ~sub_level_temp$LocationName)
lon_value <- plot_ly(x = ~eq_df$epicenterLon, y = ~eq_df$magnitudeValue, name = "Lon", type = 'scatter', mode = 'markers')
lat_value <- plot_ly(x = ~eq_df$magnitudeValue, y = ~eq_df$epicenterLat, name = "Lat", type = 'scatter', mode = 'markers')
subplot(lon_value,lat_value)
t1 <- eq_level_temp %>% group_by(ym) %>% summarise(eqrthquake = n(),
                                                   Value = mean(magnitudeValue),
                                                   Depth = mean(depth),
                                                   Level = mean(MeanSeaLevel),
                                                   Temp = mean(MonthAvg))
t1$Year <- substring(t1$ym,1,4)
plot_ly(x = ~t1$Depth, y = ~t1$Value, type = "scatter", mode = 'markers')
ggplotly(ggplot(t1, aes(x = Depth, y = Value)) + geom_point() + geom_smooth())
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
subplot(plot_ly(x = ~t1$ym, y = ~t1$Value, name = "Value", type = 'scatter', mode = 'lines'),
        plot_ly(x = ~t1$ym, y = ~t1$Depth, name = "Depth", type = 'scatter', mode = 'lines'),
        plot_ly(x = ~t1$ym, y = ~t1$eqrthquake, name = "num", type = 'scatter', mode = 'lines'),
        nrows = 3)
a <- t1 %>% plot_ly(x = ~.$ym, y = ~.$eqrthquake, name = "Num", type = 'scatter', mode = 'lines')
b <- t1 %>% plot_ly(x = ~.$ym, y = ~.$Value, name = "Value", type = 'scatter', mode = 'lines')
c <- t1 %>% plot_ly(x = ~.$ym, y = ~.$Depth, name = "Depth", type = 'scatter', mode = 'lines')
d <- t1 %>% plot_ly(x = ~.$ym, y = ~.$Level, name = "Level", type = 'scatter', mode = 'lines')
e <- t1 %>% plot_ly(x = ~.$ym, y = ~.$Temp, name = "Temp", type = 'scatter', mode = 'lines')
t1 %>% plot_ly(x = ~.$Level, y = ~.$Temp, z = ~.$Depth, type = 'scatter3d', mode = 'markers', color = .$Value)
## Warning: Ignoring 8 observations

Compare

subplot(a,b, nrows = 2)
cor.test(t1$eqrthquake, t1$Value, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$eqrthquake and t1$Value
## t = 1.6888, df = 71, p-value = 0.09565
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03514261  0.40812874
## sample estimates:
##       cor 
## 0.1965135
subplot(a,c,nrows = 2)
cor.test(t1$eqrthquake, t1$Depth, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$eqrthquake and t1$Depth
## t = -4.3258, df = 71, p-value = 4.878e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.6214764 -0.2532533
## sample estimates:
##        cor 
## -0.4567102
subplot(a,d,nrows = 2)
cor.test(t1$eqrthquake, t1$Level, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$eqrthquake and t1$Level
## t = -1.7419, df = 70, p-value = 0.08592
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.41585902  0.02922087
## sample estimates:
##        cor 
## -0.2038275
subplot(a,e,nrows = 2)
cor.test(t1$eqrthquake, t1$Temp, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$eqrthquake and t1$Temp
## t = -0.31545, df = 63, p-value = 0.7535
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2808907  0.2061840
## sample estimates:
##         cor 
## -0.03971218
subplot(b,c, nrows = 2)
cor.test(t1$Value, t1$Depth, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$Value and t1$Depth
## t = -1.8024, df = 71, p-value = 0.07573
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.41907271  0.02195082
## sample estimates:
##        cor 
## -0.2091728
subplot(b,d, nrows = 2)
cor.test(t1$Value, t1$Level, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$Value and t1$Level
## t = -2.2927, df = 70, p-value = 0.02487
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.46734322 -0.03474785
## sample estimates:
##        cor 
## -0.2642889
subplot(b,e, nrows = 2)
cor.test(t1$Value, t1$Temp, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$Value and t1$Temp
## t = -1.5681, df = 63, p-value = 0.1219
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.41796128  0.05256364
## sample estimates:
##        cor 
## -0.1938203
subplot(c,d, nrows = 2)
cor.test(t1$Depth, t1$Level, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$Depth and t1$Level
## t = 0.62232, df = 70, p-value = 0.5358
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1602454  0.3006785
## sample estimates:
##       cor 
## 0.0741769
subplot(c,e, nrows = 2)
cor.test(t1$Depth, t1$Temp, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$Depth and t1$Temp
## t = -0.44542, df = 63, p-value = 0.6575
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2958857  0.1904720
## sample estimates:
##         cor 
## -0.05603008
subplot(d,e, nrows = 2)
cor.test(t1$Level, t1$Temp, use = "complete.obs")
## 
##  Pearson's product-moment correlation
## 
## data:  t1$Level and t1$Temp
## t = 10.523, df = 63, p-value = 1.648e-15
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6885426 0.8723934
## sample estimates:
##       cor 
## 0.7983678

Conclution

  1. 海平面與海水溫度有著很大的相關性,反觀地震與海洋之間的關聯性很低
  2. 針對地震資料來看,深度與規模的相關係數也只有 0.073,屬於無相關
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## [1] 0.07320232
## Warning: Ignoring 1 observations

3. 統計要好好學!

Problems & Solutions

  1. cannot get the table data from website “https://rdc28.cwb.gov.tw/TDB/public/warning_typhoon_list/
    Solution: Manually copy and paste to Numbers then export to csv file
  2. reading the nodes in xml
    Solution : Read the root node of each record first, then use xml_find_first & xml_find_all If take xml_find_all without reading the root node of each record, will get wrong value
    Stacloverflow_IssuePost
  3. rstudio console is displayed with the system default language
    Solution : https://stackoverflow.com/questions/56503222/how-to-data-frame-with-different-number-of-rows-but-related-not-by/56511335#56511335
  4. reading xml file using xml_find_all but nothing is pull out!
    Solution : get the JSON file for rapid using.